Load packages

library(tidyverse) ## Always load tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext) ## Needed for text analysis
library(data.table) ## Faster reading and writing of large files 
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(plotly) ## Interactive plots
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Load in clean dataframes you prepared

## Set current directory as our path (here::here() resolves the project root)
path <- here::here()

## Read clean dataframes with data.table::fread for fast CSV reads.
## The raw token-level frames are commented out — only the aggregated
## per-year and per-category counts are needed in this analysis.
#sephora_tokens <- fread(paste0(path, "/data/sephora_tokens.csv"))
#sephora_token_counts <- fread(paste0(path, "/data/sephora_token_counts.csv"))
sephora_token_counts_year <- fread(paste0(path, "/data/sephora_token_counts_year.csv"))
sephora_token_counts_category <- fread(paste0(path, "/data/sephora_token_counts_category.csv"))

Which words are associated with higher or lower ratings?

Note to self, remember to collapse across years

## Plot word frequency (log x) against average rating, one text label per
## word, faceted by product category. The dashed line marks rating 2.63
## (presumably the dataset-wide mean rating — TODO confirm against the data).
## Fix: `anti_join()` now names its key explicitly via `join_by(word)`,
## which silences the "Joining with ..." message and guards against the
## stopword table gaining extra matching columns.
sephora_token_counts_category %>%
  anti_join(get_stopwords(), by = join_by(word)) %>%  # drop common stopwords
  filter(str_detect(word, "[A-Za-z]")) %>%  # keep tokens with at least one letter
  filter(nchar(word) > 1) %>%               # drop single-character tokens
  filter(n > 30) %>%                        # require a minimum frequency
  ggplot(aes(n, average_rating)) +
  geom_hline(yintercept = 2.63, lty = 2, color = "grey", linewidth = 0.5) +
  geom_text(aes(label = word, color = secondary_category),
            check_overlap = TRUE, show.legend = FALSE,
            vjust = "top", hjust = "left") +
  scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
  facet_wrap(vars(secondary_category)) +
  theme_classic()
## Joining with `by = join_by(word)`

Exploring word use trajectories across year

Work in progress, decided to change my approach

## Smoothed (loess) trajectory of average rating across years,
## one thin curve per word.
sephora_token_counts_year %>%
  ggplot(aes(year, average_rating, group = word)) +
  geom_smooth(se = FALSE, method = "loess", linewidth = 0.2) +
  labs(x = "Year", y = "Score") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

## Linear trajectory of average rating across years, one thin line per word.
sephora_token_counts_year %>%
  ggplot(aes(year, average_rating, group = word)) +
  geom_smooth(se = FALSE, method = "lm", linewidth = 0.2) +
  labs(x = "Year", y = "Score") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

Fitting relationship between word use over time and product rating

## For each (word, secondary_category) group, fit average_rating ~ year and
## keep the slope, intercept, and the slope's p-value.
## Fixes vs. the original:
##   * dplyr::do() is superseded — replaced with group_modify().
##   * filter(!is.null(model)) was a no-op (list-column elements from do()
##     are never NULL) — removed.
##   * scalar ifelse() with vector `&` replaced by plain if/else with `&&`.
## Output columns and order are unchanged:
## word, secondary_category, slope, intercept, significance.
linear_slopes <- sephora_token_counts_category %>%
  group_by(word, secondary_category) %>%
  group_modify(~ {
    model <- lm(average_rating ~ year, data = .x)
    coefs <- summary(model)$coefficients
    tibble(
      slope = coef(model)[2],      # slope coefficient (NA if year was dropped)
      intercept = coef(model)[1],  # intercept coefficient
      # p-value for the slope when the fit produced one; NA otherwise
      # (e.g. a group observed in a single year has no year coefficient)
      significance = if (nrow(coefs) >= 2 && ncol(coefs) >= 4) {
        coefs[2, 4]
      } else {
        NA_real_
      }
    )
  }) %>%
  ungroup()
## Warning: There were 3 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `significance = ifelse(...)`.
## ℹ In row 14737.
## Caused by warning in `summary.lm()`:
## ! essentially perfect fit: summary may be unreliable
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.

Which words have most substantial change in associated rating over time per group?

## Per category, show the 12 words with the largest |slope| among those with
## a significant (p < 0.05) trend, as diverging bars colored by direction.
## A few noise tokens are excluded by hand.
linear_slopes %>%
  filter(significance < 0.05, !word %in% c("opt", "vit", "sat", "co")) %>%
  group_by(secondary_category) %>%
  slice_max(abs(slope), n = 12) %>%
  ungroup() %>%
  ggplot(aes(slope, fct_reorder(word, slope), fill = slope > 0)) +
  geom_col(alpha = 0.8) +
  scale_fill_discrete(
    name = "Associated with...",
    labels = c("WORSE ratings over time", "BETTER ratings over time")
  ) +
  labs(y = "keyword") +
  facet_wrap(vars(secondary_category), scales = "free") +
  theme_classic()

Plotting words with substantial change in associated rating over time

Note to self, work on splitting legend labels by facets, and maybe pulling only adjectives. Starting point: https://stackoverflow.com/questions/14840542/place-a-legend-for-each-facet-wrap-grid-in-ggplot2

## Keep, per category, the 12 words with the largest |slope| among
## significant (p < 0.05) trends, excluding a few hand-picked noise tokens;
## only the (word, secondary_category) keys are retained for joining.
linear_slopes_significant_words <- linear_slopes %>%
  filter(significance < 0.05, !word %in% c("opt", "vit", "sat", "co")) %>%
  group_by(secondary_category) %>%
  slice_max(abs(slope), n = 12) %>%
  ungroup() %>%
  select(word, secondary_category)

## Linear trend of average rating over years for the selected words,
## one colored line per word, faceted by category.
token_trajectories_lm <- sephora_token_counts_category %>%
  right_join(linear_slopes_significant_words,
             by = c("word", "secondary_category")) %>%
  ggplot(aes(year, average_rating, group = word, color = word)) +
  geom_smooth(se = FALSE, method = "lm") +
  labs(x = "Year", y = "Score") +
  facet_wrap(vars(secondary_category)) +
  theme_classic()

## Loess-smoothed trend of average rating over years for the selected words,
## one colored curve per word, faceted by category.
token_trajectories_loess <- sephora_token_counts_category %>%
  right_join(linear_slopes_significant_words,
             by = c("word", "secondary_category")) %>%
  ggplot(aes(year, average_rating, group = word, color = word)) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(x = "Year", y = "Score") +
  facet_wrap(vars(secondary_category)) +
  theme_classic()

## Render interactive (plotly) versions of the two trajectory plots
ggplotly(token_trajectories_lm)
## `geom_smooth()` using formula = 'y ~ x'
ggplotly(token_trajectories_loess)
## `geom_smooth()` using formula = 'y ~ x'

In future, add a table that pulls examples of how these words are used positively or negatively over time